## Log the start time; the matching call at the end of the script lets the
## run duration be read off the output.
print(Sys.time())

## Keep character columns as character when data frames are built.
## Spelled FALSE rather than F because F is an ordinary, reassignable variable.
options(stringsAsFactors = FALSE)


## suppressPackageStartupMessages(library(xts))
## suppressPackageStartupMessages(library(strucchange))
suppressPackageStartupMessages(library(parrot))
suppressPackageStartupMessages(library(tidyr))
## suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(xtable))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(irr))
suppressPackageStartupMessages(library(ROCR))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(pryr))
suppressPackageStartupMessages(library(tidytext))

## glove_embeddings <- FALSE

## `glove_embeddings` selects which saved workspace is analysed. It is not
## assigned anywhere in this file, so it is expected to be set by whatever
## runs the script. When it is missing, fall back to FALSE (the value shown in
## the commented-out assignment above) instead of failing with
## "object 'glove_embeddings' not found".
if (!exists("glove_embeddings")) {
    message("`glove_embeddings` is not set; defaulting to FALSE")
    glove_embeddings <- FALSE
}

if (!glove_embeddings) {
    load("data/code_test_labels_2015_replicate_psrm.RData")
} else {
    load("data/code_test_labels_glove_2015_replicate_psrm.RData")
    ## NOTE(review): `glove_meta_short` is presumably restored by the load()
    ## call above -- confirm. It is copied to the name the rest of the script
    ## works with.
    ordered_docs_short <- glove_meta_short
}

## Flag tweets whose text mentions voting or elections. The match is a
## case-insensitive regular expression without word boundaries, so each
## alternative also matches inside longer words.
ordered_docs_short[["vote"]] <- grepl(
    pattern = "vote|voting|election|support",
    x = ordered_docs_short[["tweet_text"]],
    ignore.case = TRUE
)
## Appending |clinton|hillary|donald|trump to the pattern above was also
## considered (original note: "same results more conservative content").

## Flag tweets containing negation and related terms; combined with `vote`
## in the aggregation further down (`vote & not`). Same matching rules as
## above, so e.g. "not" also matches inside "nothing".
ordered_docs_short[["not"]] <- grepl(
    pattern = "not|n't|boycott|sit out|fuck|truth|rigged|before|illegal|deserv",
    x = ordered_docs_short[["tweet_text"]],
    ignore.case = TRUE
)

## Tweet-level AFINN sentiment.
##
## Tweets are split into one row per word, matched against the AFINN lexicon
## and averaged per tweet:
##   sentiment  mean AFINN score of the matched words
##   n_words    number of words that matched the lexicon (not all words,
##              because of the inner join)
doctidy <- local({
    afinn <- get_sentiments("afinn")
    ## The score column is called `score` in older tidytext releases and
    ## `value` in newer ones; normalise to `score` so either version works.
    names(afinn)[names(afinn) == "value"] <- "score"

    ordered_docs_short[, c("tweetid", "tweet_text")] %>%
        unnest_tokens(word, tweet_text) %>%
        ## explicit key: no reliance on (or message from) a natural join
        inner_join(afinn, by = "word") %>%
        group_by(tweetid) %>%
        summarise(
            sentiment = mean(score),
            n_words = n()
        )
})

## Attach the scores; tweets without any lexicon word get NA in both columns.
ordered_docs_short <- left_join(ordered_docs_short, doctidy, by = "tweetid")



## Twitter user ids of accounts of interest (handle: id).
##
## official account
##   blklivesmatter: 1604931252
## blm co-founders
##   aliciagarza:    57039392
##   OsopePatrisse:  84801794
##   opalayo:        51546100
## other accounts (trailing "# n" figures are carried over from the original
## notes; their meaning is not documented here)
##   shaunking:      755113
##   deray:          29417304   # 926
##     sample(subset(ordered_docs, retweet_userid %in% 29417304)$tweet_text, 10)
##   MsPackyetti:    239509917  # 318
##   Nettaaaaaaaa:   1291770157 # 466
##   janaya_khan:    4749191209
##   TefPoe:         40293464   (not included in either vector below)
blm_activists <- c("1604931252", "57039392", "84801794", "51546100")
blm_and_activists <- c(
    blm_activists,
    "755113", "29417304", "239509917", "1291770157", "4749191209"
)
hillary_clinton <- "1339835893"
donald_trump <- "25073877"

## Working copy of the tweets with retweet-source flags and a calendar date.
ordered_docs <- ordered_docs_short %>%
    mutate(
        ## retweet of the official BLM account
        official_blm = retweet_userid %in% 1604931252,
        ## retweet of any account listed in the `blm_and_activists` vector
        blm_and_activists = retweet_userid %in% blm_and_activists,
        ## the first 10 characters of `tweet_time` are parsed as the date
        tweet_date = as.Date(substr(tweet_time, 1, 10))
    )

## Optional cross-check against a separate file of tweets from the official
## BLM account; the whole section is skipped when the file is absent.
if (file.exists("data/blklivesmatter_tweets.csv")) {

    ## Read every column as text. A single "character" is recycled over all
    ## columns, so the column count no longer has to be hard-coded.
    blm_tweets <- read.csv(
        "data/blklivesmatter_tweets.csv",
        colClasses = "character"
    )

    ## Retweets of the official account whose source tweet id also appears
    ## in the file.
    ordered_docs <- ordered_docs %>%
        mutate(
            official_blm_in_twint = (retweet_userid %in% 1604931252) &
                retweet_tweetid %in% blm_tweets$id
        )

    ## Values computed inside a braced block are not auto-printed, so every
    ## diagnostic below is printed explicitly.

    ## ## summarize actual BLM: 2016-01-01 up to (excluding) 2016-11-08
    print(summary(
        subset(
            ordered_docs,
            tweet_date >= "2016-01-01" & tweet_date < "2016-11-08"
        )$official_blm_in_twint
    )) # original note: same
    ## distinct source tweets among those retweets
    print(length(
        unique(
            subset(
                ordered_docs,
                tweet_date >= "2016-01-01" & tweet_date < "2016-11-08"
                & official_blm_in_twint
            )$retweet_tweetid
        )
    ))
    ## tweets in the file for the same window; `date` is text here, so this is
    ## a string comparison.
    ## NOTE(review): assumes `date` starts with YYYY-MM-DD -- confirm.
    print(nrow(subset(blm_tweets, date >= "2016-01-01" & date < "2016-11-08")))

    ## same checks for 2016-10-01 up to (excluding) 2016-11-08
    print(summary(
        subset(
            ordered_docs,
            tweet_date >= "2016-10-01" & tweet_date < "2016-11-08"
        )$official_blm_in_twint
    )) # original note: same
    print(nrow(subset(blm_tweets, date >= "2016-10-01" & date < "2016-11-08")))

}



## Only when `glove_embeddings` is FALSE: run, in this order,
##   10a  hand label models
##   10b  model calibration and fit; hand label IRR
if (!glove_embeddings) {
    invisible(lapply(
        c(
            "code/10a_plot_tweet_labels_over_time_table_hand_label_models.R",
            "code/10b_plot_tweet_labels_over_time_figures_model_fit_calibration_and_label_evaluations.R"
        ),
        source
    ))
}


####  ####  ####
####  ####  ####
####  ####  ####
####  ####  ####

## aggregate

library(lubridate)
## Map each date to (roughly) the middle of its calendar month.
##
## .date: dates, or anything as.Date() accepts.
##
## Returns a Date vector: first day of the month, plus half the month's
## length, minus one day. For months with an odd number of days the result
## therefore carries a half-day fraction. Results equal to 2016-11-15 are
## replaced by 2016-11-04.
## NOTE(review): presumably because the data end in early November 2016 --
## confirm.
##
## Uses floor_date() / ceiling_date() from lubridate, which must be attached.
month_midpoint <- function(.date) {
    day <- as.Date(.date)
    month_start <- floor_date(day, "month")
    month_length <- ceiling_date(day, "month") - month_start
    midpoints <- month_start + month_length / 2 - 1
    midpoints[midpoints == "2016-11-15"] <- as.Date("2016-11-04")
    midpoints
}

## Summarise tweets per account_category x tweet_date. Shared by the weekly,
## user-demeaned and monthly aggregations below, which previously each
## carried their own copy of this code.
##
## .docs: tweet-level data whose `tweet_date` column already holds the
##        aggregation period, plus the prediction (`*_pred`), label
##        (`social_justice`, `entertainment`, `politics`), `vote`, `not` and
##        `sentiment` columns used below.
##
## Returns one row per account_category x tweet_date. As in the original
## pipelines, no ungroup() follows summarise(), so any grouping dplyr leaves
## on the result is kept for the scripts sourced later.
.summarise_tweet_labels <- function(.docs) {
    .docs %>%
        ungroup() %>%
        group_by(account_category, tweet_date) %>%
        summarise(
            ## average of predicted probabilities
            social_justice_p = mean(social_justice_pred, na.rm = TRUE),
            entertainment_p = mean(entertainment_pred, na.rm = TRUE),
            politics_p = mean(politics_pred, na.rm = TRUE),
            ## thresholding: share of non-missing labels at or above 0.5
            social_justice_a = mean(social_justice >= 0.5, na.rm = TRUE),
            entertainment_a = mean(entertainment >= 0.5, na.rm = TRUE),
            politics_a = mean(politics >= 0.5, na.rm = TRUE),
            ## sums
            social_justice_sum = sum(social_justice_pred, na.rm = TRUE),
            entertainment_sum = sum(entertainment_pred, na.rm = TRUE),
            politics_sum = sum(politics_pred, na.rm = TRUE),
            social_justice_sum_a = sum(social_justice >= 0.5, na.rm = TRUE),
            entertainment_sum_a = sum(entertainment >= 0.5, na.rm = TRUE),
            politics_sum_a = sum(politics >= 0.5, na.rm = TRUE),
            ## counts
            n_tweets = n(),
            n_handcoded_tweets = sum(!is.na(social_justice)),
            n_vote = sum(vote),
            n_no_vote = sum(vote & not),
            n_vote_any_sent = sum(!is.na(sentiment) & vote),
            n_vote_neg_sent = sum(!is.na(sentiment) & sentiment < 0 & vote)
        ) %>%
        mutate(
            sum_any_a = social_justice_a + entertainment_a + politics_a,
            ## NOTE(review): "pny" looks like a typo for "any"; the name is
            ## kept because scripts sourced later may refer to it.
            sum_pny_p = social_justice_p + entertainment_p + politics_p
        )
}

## Only tweets that have a social-justice prediction enter the aggregates.
.labeled_docs <- subset(ordered_docs, !is.na(social_justice_pred))

## weekly: tweet_date is the week as returned by floor_date(unit = "week")
aggregated_tweets <- .labeled_docs %>%
    mutate(
        tweet_date = lubridate::floor_date(
            as.Date(substr(tweet_time, 1, 10)),
            unit = "week"
        )
    ) %>%
    .summarise_tweet_labels()

## weekly, with predictions demeaned at user level before aggregating
aggregated_tweets_center_mean <- .labeled_docs %>%
    mutate(
        tweet_date = lubridate::floor_date(
            as.Date(substr(tweet_time, 1, 10)),
            unit = "week"
        )
    ) %>%
    group_by(userid) %>%
    mutate(
        ## scale() returns a one-column matrix carrying a "scaled:center"
        ## attribute; as.numeric() keeps the same centred values but stores
        ## them as a plain numeric column.
        social_justice_pred = as.numeric(
            scale(social_justice_pred, scale = FALSE)
        ),
        entertainment_pred = as.numeric(
            scale(entertainment_pred, scale = FALSE)
        ),
        politics_pred = as.numeric(
            scale(politics_pred, scale = FALSE)
        )
    ) %>%
    .summarise_tweet_labels()

## monthly: tweet_date is the month midpoint from month_midpoint()
aggregated_tweets_m <- .labeled_docs %>%
    mutate(
        tweet_date = month_midpoint(as.Date(substr(tweet_time, 1, 10)))
    ) %>%
    .summarise_tweet_labels()

## drop the temporary copy of the filtered tweets
rm(.labeled_docs)


## Split each aggregate into its LeftTroll and RightTroll series.

## weekly, keeping periods dated 2014-06-01 or later
aggregated_tweets_left <- aggregated_tweets %>%
    subset(account_category == "LeftTroll" & tweet_date >= "2014-06-01")
aggregated_tweets_right <- aggregated_tweets %>%
    subset(account_category == "RightTroll" & tweet_date >= "2014-06-01")

## demeaned
## NOTE(review): unlike the weekly and monthly series, no date cut-off is
## applied here -- confirm this is intended.
aggregated_tweets_center_mean_left <- aggregated_tweets_center_mean %>%
    subset(account_category == "LeftTroll")
aggregated_tweets_center_mean_right <- aggregated_tweets_center_mean %>%
    subset(account_category == "RightTroll")

## monthly, keeping periods dated 2014-06-01 or later
aggregated_tweets_m_left <- aggregated_tweets_m %>%
    subset(account_category == "LeftTroll" & tweet_date >= "2014-06-01")
aggregated_tweets_m_right <- aggregated_tweets_m %>%
    subset(account_category == "RightTroll" & tweet_date >= "2014-06-01")




## hand labels
## Column totals of the thresholded label counts (*_sum_a) over all weekly
## rows of each troll category, rounded to 2 digits and then divided by 450.
## NOTE(review): 450 is presumably the number of hand-coded tweets per
## category -- confirm. The rounding is applied to the totals, before the
## division, so the resulting proportions themselves are not rounded.
props_l <- round(
    aggregate(
        cbind(social_justice_sum_a, entertainment_sum_a, politics_sum_a) ~ 1,
        data = aggregated_tweets_left,
        FUN = sum
    ),
    digits = 2
) / 450

props_r <- round(
    aggregate(
        cbind(social_justice_sum_a, entertainment_sum_a, politics_sum_a) ~ 1,
        data = aggregated_tweets_right,
        FUN = sum
    ),
    digits = 2
) / 450



## Figures: labeled tweets over time (run regardless of `glove_embeddings`).
source("code/10c_plot_tweet_labels_over_time_figures_labeled_tweets.R")

## Figures: voter suppression tweets (only when `glove_embeddings` is FALSE).
if (!glove_embeddings) {
    source("code/10d_plot_tweet_labels_over_time_figures_suppression.R")
}

## Log the finish time; pairs with the timestamp printed at the top.
print(Sys.time())
